Natural Language Processing (NLP) for:
Harry Potter and the Philosopher’s Stone (1997)
Harry Potter and the Chamber of Secrets (1998)
Harry Potter and the Prisoner of Azkaban (1999)
Harry Potter and the Goblet of Fire (2000)
Harry Potter and the Order of the Phoenix (2003)
Harry Potter and the Half-Blood Prince (2005)
Harry Potter and the Deathly Hallows (2007)

Bottom Line Up Front

Setup

rm(list = ls())
gc()


# install.packages('pacman') install.packages('remotes') library(remotes)
# install.packages('devtools') library(devtools)
# remotes::install_github('nrguimaraes/sentimentSetsR') library(magrittr)
# devtools::install_github('wch/webshot') webshot::install_phantomjs()
# library(BiocManager)
# BiocManager::install('https://bioconductor.org/biocLite.R')
# source('https://bioconductor.org/biocLite.R') biocLite('EBImage')
library(pacman)
pacman::p_load(devtools, knitr, magrittr, dplyr, ggplot2, rvest, sentimentSetsR, 
    caret, textTinyR, text2vec, tm, tidytext, stringr, stringi, SnowballC, stopwords, 
    wordcloud, prettydoc, cowplot, kable, utf8, corpus, glue, topicmodels, stm, wordcloud2, 
    htmlwidgets, viridis)

knitr::opts_chunk$set(echo = TRUE, message = FALSE, comment = NA, warning = FALSE, 
    tidy = TRUE, results = "hold", cache = FALSE, dpi = 120, fig.width = 8, fig.height = 6)

# Parameters N-gram
ngrams <- "single words"
ngram <- c(1, 1)

## Number of Topics
K <- 4

# Custom functions Remove quotation marks
pasteNQ <- function(...) {
    output <- paste(...)
    noquote(output)
}
pasteNQ0 <- function(...) {
    output <- paste0(...)
    noquote(output)
}

# Proportions
propTab <- function(data, exclude = NULL, useNA = "no", dnn = NULL, deparse.level = 1, 
    digits = 0) {
    round(table(data, exclude = exclude, useNA = useNA, dnn = dnn, deparse.level = deparse.level)/NROW(data) * 
        100, digits)
}


## Chart Template
Grph_theme <- function() {
    palette <- brewer.pal("Greys", n = 9)
    color.background = palette[2]
    color.grid.major = palette[3]
    color.axis.text = palette[6]
    color.axis.title = palette[7]
    color.title = palette[9]
    theme_bw(base_size = 9) + theme(panel.background = element_rect(fill = color.background, 
        color = color.background)) + theme(plot.background = element_rect(fill = color.background, 
        color = color.background)) + theme(panel.border = element_rect(color = color.background)) + 
        theme(panel.grid.major = element_line(color = color.grid.major, size = 0.25)) + 
        theme(panel.grid.minor = element_blank()) + theme(axis.ticks = element_blank()) + 
        theme(legend.position = "none") + theme(legend.title = element_text(size = 11, 
        color = "black")) + theme(legend.background = element_rect(fill = color.background)) + 
        theme(legend.text = element_text(size = 9, color = "black")) + theme(strip.text.x = element_text(size = 9, 
        color = "black", vjust = 1)) + theme(plot.title = element_text(color = color.title, 
        size = 20, vjust = 1.25)) + theme(axis.text.x = element_text(size = 9, color = "black")) + 
        theme(axis.text.y = element_text(size = 9, color = "black")) + theme(axis.title.x = element_text(size = 10, 
        color = "black", vjust = 0)) + theme(axis.title.y = element_text(size = 10, 
        color = "black", vjust = 1.25)) + theme(plot.margin = unit(c(0.35, 0.2, 0.3, 
        0.35), "cm"))
}

## Clean Corpus
basicclean <- function(rawtext) {
    # Set to lowercase
    rawtext <- tolower(rawtext)
    print(pasteNQ("Set to lowercase"))
    
    # Remove contractions
    fix_contractions <- function(rawtext) {
        rawtext <- gsub("will not", "won't", rawtext)
        rawtext <- gsub("can't", "can not", rawtext)
        rawtext <- gsub("can not", "cannot", rawtext)
        rawtext <- gsub("shant", "shall not", rawtext)
        rawtext <- gsub("n't", " not", rawtext)
        rawtext <- gsub("'ll", " will", rawtext)
        rawtext <- gsub("'re", " are", rawtext)
        rawtext <- gsub("'ve", " have", rawtext)
        rawtext <- gsub("'m", " am", rawtext)
        rawtext <- gsub("'d", " would", rawtext)
        rawtext <- gsub("'ld", " would", rawtext)
        rawtext <- gsub("'ld", " would", rawtext)
        rawtext <- gsub("'s", "", rawtext)
        return(rawtext)
    }
    rawtext <- fix_contractions(rawtext)
    print(pasteNQ("Fixed contractions"))
    
    # Strip whitespace
    rawtext <- stripWhitespace(rawtext)
    print(pasteNQ("Stripped whitespace"))
    
    return(rawtext)
}

# Remove stop words
removestopwords <- function(rawtext, remove = NULL, retain = NULL) {
    
    # Remove stop words
    stopwords_custom <- stopwords::stopwords("en", source = "snowball")
    stopwords_custom <- c(stopwords_custom, remove)
    stopwords_retain <- retain
    stopwords_custom <- stopwords_custom[!stopwords_custom %in% stopwords_retain]
    rawtext <- removeWords(rawtext, stopwords_custom)
    print(pasteNQ("Removed", length(stopwords_custom), "stop words"))
    
    return(rawtext)
}

## Word Stemming
wordstem <- function(rawtext) {
    # Stemming words
    rawtext <- stemDocument(rawtext)
    print(pasteNQ("Stemmed words"))
    
    return(rawtext)
}

## Remove Non-Alpha
removenonalpha <- function(rawtext) {
    # Remove puncutation, numbers, and other none characters
    rawtext <- removePunctuation(rawtext)
    rawtext <- removeNumbers(rawtext)
    rawtext <- gsub("[^[:alnum:]///' ]", "", rawtext)
    rawtext <- gsub("[']", "", rawtext)
    print(pasteNQ("Removed punctuation, numbers, and other none characters"))
    
    return(rawtext)
}

# Remove JavaScript from WordClouds
library("EBImage")
embed_htmlwidget <- function(widget, rasterise = T) {
    outputFormat = knitr::opts_knit$get("rmarkdown.pandoc.to")
    if (rasterise || outputFormat == "latex") {
        html.file = tempfile("tp", fileext = ".html")
        png.file = tempfile("tp", fileext = ".png")
        
        htmlwidgets::saveWidget(widget, html.file, selfcontained = FALSE)
        webshot::webshot(html.file, file = png.file, vwidth = 700, vheight = 500, 
            delay = 10)
        img = EBImage::readImage(png.file)
        EBImage::display(img)
    } else {
        widget
    }
}

Load Data

setwd("C:/Users/siebe/Documents/07_Books/Harry Potter/")

titles <- c("Philosopher's Stone", "Chamber of Secrets", "Prisoner of Azkaban", "Goblet of Fire", 
    "Order of the Phoenix", "Half-Blood Prince", "Deathly Hallows")
html <- c("Harry_Potter_and_the_Philosophers_Stone.html", "Harry_Potter_and_the_Chamber_of_Secrets.html", 
    "Harry_Potter_and_the_Prisoner_of_Azkaban.html", "Harry_Potter_and_the_Goblet_of_Fire.html", 
    "Harry_Potter_and_the_Order_of_the_Phoenix.html", "Harry_Potter_and_the_Half-Blood_Prince.html", 
    "Harry_Potter_and_the_Deathly_Hallows.html")

books <- tibble(Text = as.character(), Book = as.character())
para3 <- tibble(Text = as.character(), Book = as.character())

for (i in 1:7) {
    rawtext <- read_html(html[i]) %>% html_nodes(xpath = "/html/body/p") %>% html_text(trim = TRUE)
    
    wordcount <- sapply(strsplit(rawtext, " "), length)
    paragraph <- rawtext[wordcount >= 3]
    
    # Book Level Documents
    books <- rbind(books, tibble(Text = str_c(paragraph, collapse = " "), Book = titles[i]))
    
    # Paragraph Level Documents
    triplet <- do.call(rbind, lapply(seq(1, length(paragraph), by = 3), function(x) tibble(Text = str_c(paragraph[x:(x + 
        2)], collapse = " "), Book = titles[i])))
    para3 <- rbind(para3, triplet)
}

# Page Level Documents
pages <- books %>% unnest_tokens(Text, Text, strip_punct = T, to_lower = F) %>% group_by(Book, 
    Page = dplyr::row_number()%/%250) %>% dplyr::summarize(Text = stringr::str_c(Text, 
    collapse = " ")) %>% mutate(Page = dplyr::row_number()) %>% ungroup()

# Place books in chronolgical order
books$Book <- factor(books$Book, levels = titles)
para3$Book <- factor(para3$Book, levels = titles)
pages$Book <- factor(pages$Book, levels = titles)

Sentiment at the Paragraph Level

For this sentiment analysis, I want to grab the text around certain characters. Chapters and pages are too much text as they can contain multiple story points, and sentences are too little text as they likely contain little contextual information.

Instead, I plan to take paragraphs as the documents for my corpus. However, paragraphs can be single sentences as in the case of two characters switch dialogue. Therefore, I take paragraph triplets: three paragraphs containing three or more words.

Word Distributions

I start checking for normal (or “normal-ish”) distributions of words per paragraph triplet to make sure there is some consistency in document length. The word distributions appear to approximate a normal distribution, although with a non-trivial right tail. In addition, the word distributions are similar across books, making it a comparable level of analysis.

para3$Wordcount <- sapply(strsplit(para3$Text %>% as.character(), " "), length)

pasteNQ0("Average Amount of Words per Paragraph Triplet")
summary(para3$Wordcount)

# Graph distribution of words all
ggplot(para3, aes(Wordcount, fill = I("#7F0909"))) + geom_histogram() + stat_bin(bins = 100) + 
    Grph_theme() + ylab("Frequency") + xlab("Count of Words") + ggtitle("Words per Paragraph Triplet")

# Graph with book fill
ggplot(para3, aes(Wordcount, fill = Book)) + geom_area(stat = "bin") + scale_fill_manual(values = c("#946B2D", 
    "#0D6217", "#000A90", "#AAAAAA", "#000000", "#7F0909", "#FFC500")) + Grph_theme() + 
    ylab("Frequency") + xlab("Count of Words") + ggtitle("Words per Paragraph Triplet") + 
    theme(legend.text = element_text(size = 8, color = "black")) + theme(legend.position = "bottom")

[1] Average Amount of Words per Paragraph Triplet
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   52.00   77.00   88.54  111.00  530.00 

Heroes and Villians

I create three character dictionaries to compare against each other. The first contains our main trio, at least one of whom is mentioned in

# Create Dictionaries
trio <- c("harry", "ron", "hermione")
pasteNQ("Main Characters:")
paste(trio)
cat("\n")

heroes <- c("lily", "james", "hagrid", "dumbledore", "sirius", "lupin", "moody", 
    "slughorn", "dobby", "cedric", "luna", "tonks", "mcgonagall", "ginny", "order of the phoenix", 
    "neville")
pasteNQ("Number of Heroes:", length(heroes))
pasteNQ("Including:")
paste(heroes)
cat("\n")

villians <- c("snape", "draco", "lucius", "umbridge", "voldemort", "quirrell", "pettigrew", 
    "dementor", "greyback", "bellatrix", "dudley", "vernon", "petunia", "riddle", 
    "death eaters", "aragog", "basilisk", "nagini", "dementors")
pasteNQ("Number of Villians:", length(villians))
pasteNQ("Including:")
paste(villians)
[1] Main Characters:
[1] "harry"    "ron"      "hermione"

[1] Number of Heroes: 16
[1] Including:
 [1] "lily"                 "james"                "hagrid"              
 [4] "dumbledore"           "sirius"               "lupin"               
 [7] "moody"                "slughorn"             "dobby"               
[10] "cedric"               "luna"                 "tonks"               
[13] "mcgonagall"           "ginny"                "order of the phoenix"
[16] "neville"             

[1] Number of Villians: 19
[1] Including:
 [1] "snape"        "draco"        "lucius"       "umbridge"     "voldemort"   
 [6] "quirrell"     "pettigrew"    "dementor"     "greyback"     "bellatrix"   
[11] "dudley"       "vernon"       "petunia"      "riddle"       "death eaters"
[16] "aragog"       "basilisk"     "nagini"       "dementors"   

Sentiment Scores

# Sentiment Scores
average <- function(x) {
    pos <- sum(x[x > 0], na.rm = T)
    neg <- sum(x[x < 0], na.rm = T) %>% abs()
    neu <- length(x[x == 0])
    bal <- ((pos - neg)/(pos + neg)) * 100
    y <- ifelse(is.nan(bal), 0, bal %>% as.numeric())
    return(y)
}
CleanText <- basicclean(para3$Text)
para3$Score <- sapply(CleanText, function(x) getSentiment(x, dictionary = "vader", 
    score.type = average))
para3$Cross <- NA
para3$Cross <- ifelse(para3$Score < 0, "Negative", para3$Cross)
para3$Cross <- ifelse(para3$Score > 0, "Positive", para3$Cross)
para3$Cross <- ifelse(para3$Score == 0, "Neutral", para3$Cross)
cat("\n")
pasteNQ("Sentiment Levels")
propTab(para3$Cross)
[1] Set to lowercase
[1] Fixed contractions
[1] Stripped whitespace

[1] Sentiment Levels
Negative  Neutral Positive     <NA> 
      43       10       47        0 

Graph Sentiment of Main Trio

# Subset text mentioning a main character
trio_sent <- data.frame(Trio = NA, Sentiment = NA)

# Find mean of sentiment scores for each main character and create ID of all trio
# for total row
trio_ID <- c()
for (string in trio) {
    findstr <- paste0("\\b", string, "\\b")
    trio_sent <- rbind(trio_sent, c(string, para3 %>% as.data.frame() %>% .[grepl(findstr, 
        CleanText), "Score"] %>% average() %>% as.numeric()))
    trio_ID <- c(trio_ID, grep(findstr, CleanText))
}

# Sort data
trio_sent <- trio_sent %>% .[order(trio_sent$Sentiment %>% as.numeric()), ] %>% tidyr::drop_na()
max_sent <- max(trio_sent$Sentiment %>% as.numeric() %>% abs() + 1)

# Add main character total
trio_ID <- unique(trio_ID)
trio_sent <- rbind(trio_sent, c("average of trio", para3 %>% as.data.frame() %>% 
    .[trio_ID, "Score"] %>% average() %>% as.numeric()))

# Graph
trio_sent %>% mutate(Trio = factor(Trio, levels = Trio)) %>% ggplot(aes(x = Trio, 
    y = Sentiment %>% as.numeric() %>% round(1), fill = Trio)) + geom_col() + coord_flip() + 
    Grph_theme() + ylab("Sentiment (-100% to 100%)") + xlab("") + ggtitle("Sentiment of trio") + 
    ylim(max_sent * -1, max_sent) + scale_fill_manual(values = c("#FFC500", "#AAAAAA", 
    "#7F0909", "black")) + guides(fill = FALSE)

Graph Sentiment of Heroes

Surprisingly, Professor Slughorn has the most positive text around him. This perhaps is because Harry for the first time enjoyed and excelled in potions with him as teacher. Additionally, Harry spends a non-trival amount of time under the Felix Felicis potion while with Professor Slughorn.

Fan favorite, Luna Lovegood, has the second most positive text around her. This is not surprising as she has a very positive-minded personality.

Professor Moody

# Subset text mentioning a hero
heroes_sent <- data.frame(Hero = NA, Sentiment = NA)

# Find mean of sentiment scores for each hero and create ID of all heroes for
# total row
heroes_ID <- c()
for (string in heroes) {
    findstr <- paste0("\\b", string, "\\b")
    heroes_sent <- rbind(heroes_sent, c(string, para3 %>% as.data.frame() %>% .[grepl(findstr, 
        CleanText), "Score"] %>% average() %>% as.numeric()))
    heroes_ID <- c(heroes_ID, grep(findstr, CleanText))
}

# Sort data
heroes_sent <- heroes_sent %>% .[order(heroes_sent$Sentiment %>% as.numeric()), ] %>% 
    tidyr::drop_na()
max_sent <- max(heroes_sent$Sentiment %>% as.numeric() %>% abs() + 1)

# Add hero total
heroes_ID <- unique(heroes_ID)
heroes_sent <- rbind(heroes_sent, c("average of heroes", para3 %>% as.data.frame() %>% 
    .[heroes_ID, "Score"] %>% average() %>% as.numeric()))

# Graph
heroes_sent %>% mutate(Hero = factor(Hero, levels = Hero)) %>% ggplot(aes(x = Hero, 
    y = Sentiment %>% as.numeric() %>% round(1), fill = Hero)) + geom_col() + coord_flip() + 
    Grph_theme() + ylab("Sentiment (-100% to 100%)") + xlab("") + ggtitle("Sentiment of Heroes") + 
    ylim(max_sent * -1, max_sent) + scale_fill_manual(values = c(viridis(nrow(heroes_sent) - 
    1), "black")) + guides(fill = FALSE)

Graph Sentiment of Villians

It makes sense that Tom Riddle, Professor Quirrell, and perhaps even Professor Snape contain more positive text than negative text.

Professor Umbridge, surprisingly, has the most positive sentiment. As a character, she projects a “everything is fine and I’m here to help” while she seeks to torture the main trio.

I intentionally defined “demonator” and “demonators”, separately. In the plural, the contain high sentiment. However, the most interesting takeaway is that, in the singular, a demonator has more positive than negative text around it This actually makes sense as Harry must think of positive thoughts/memories to combant the villians that are the personification of depression.

Unsurprisingly, the werewolf, ancient snake, and giant spider all have extremely negative sentiment.

# Subset text mentioning a villian
villians_sent <- data.frame(Villian = NA, Sentiment = NA)

# Find mean of sentiment scores for each villian and create ID of all villians
# for total row
villians_ID <- c()
for (string in villians) {
    findstr <- paste0("\\b", string, "\\b")
    villians_sent <- rbind(villians_sent, c(string, para3 %>% as.data.frame() %>% 
        .[grepl(findstr, CleanText), "Score"] %>% average() %>% as.numeric()))
    villians_ID <- c(villians_ID, grep(findstr, CleanText))
}

# Sort data
villians_sent <- villians_sent %>% .[order(villians_sent$Sentiment %>% as.numeric()), 
    ] %>% tidyr::drop_na()
max_sent <- max(villians_sent$Sentiment %>% as.numeric() %>% abs() + 1)

# Add hero total
villians_ID <- unique(villians_ID)
villians_sent <- rbind(villians_sent, c("average of villians", para3 %>% as.data.frame() %>% 
    .[villians_ID, "Score"] %>% average() %>% as.numeric()))

# Graph
villians_sent %>% mutate(Villian = factor(Villian, levels = Villian)) %>% ggplot(aes(x = Villian, 
    y = Sentiment %>% as.numeric() %>% round(1), fill = Villian)) + geom_col() + 
    coord_flip() + Grph_theme() + ylab("Sentiment (-100% to 100%)") + xlab("") + 
    ggtitle("Sentiment of Villians") + ylim(max_sent * -1, max_sent) + scale_fill_manual(values = c(viridis(nrow(villians_sent) - 
    1), "black")) + guides(fill = FALSE)

Proportion of Documents

Here, I want to note the prevalance of the main trio, the heroes, and the villians. The main trio is mentioned in ~85% of each paragraph triplet. The heroes outside the main trio are in less than 50% of each paragraph triplet, while villians are much more rare (less than 30%).

This demonstrates that J.K. Rowling prefers to focus on the likeable characters and keeps the overall story arch, featuring the villians, to a minimum.

docs_prop <- tibble(Proportion = c(round(length(trio_ID)/nrow(para3) * 100, 1), round(length(heroes_ID)/nrow(para3) * 
    100, 1), round(length(villians_ID)/nrow(para3) * 100, 1)), Characters = c("Main Trio", 
    "Heroes", "Villians"))

colors <- c(Heroes = "#7F0909", Villians = "#0D6217", `Main Trio` = "#AAAAAA")
ggplot(docs_prop, mapping = aes(x = Characters, y = Proportion, fill = Characters, 
    label = Proportion)) + geom_col() + geom_text(aes(label = Proportion), vjust = -0.5) + 
    ggtitle("Characters as Proportion of Documents") + Grph_theme() + scale_fill_manual(values = colors) + 
    theme(legend.position = "right") + ylim(0, 100) + labs(x = "Books", y = "Proportion (%)", 
    color = "Legend")

Sentiment of Characters Across Books

In the chart below, I show the average sentiment for the main trio, heroes, and villians as the series progresses. The main takeaways I see are:

  • Negative sentiment in the text around the villians grows substantially
  • Negative sentiment is more stable for the main trio and the heroes
  • Goblet of Fire has the largest disconnect between the text mentiong villians versus text mentioning the main trio or the heroes, where negative sentiment is high for the villians but is low relative to the other books for non-villians
  • The Chamber of Secrets is the only book where text mentioning the main trio or the heroes is more negative then text mentioning the villians–largely because of Tom Riddle’s tale likely starts with positive sentiment
  • The Deathly Hallows is the books featuring the most negative sentiment
# Main Characters Sentiment
trio_grp <- para3[trio_ID, ] %>% mutate(Negative = recode(Cross, Positive = 0, Neutral = 0, 
    Negative = 1), Neutral = recode(Cross, Positive = 0, Neutral = 1, Negative = 0)) %>% 
    group_by(Book) %>% summarize(`Trio Sentiment` = mean(Negative, na.rm = T) * 100)

# Heroes Sentiment
heroes_grp <- para3[heroes_ID, ] %>% mutate(Negative = recode(Cross, Positive = 0, 
    Neutral = 0, Negative = 1), Neutral = recode(Cross, Positive = 0, Neutral = 1, 
    Negative = 0)) %>% group_by(Book) %>% summarize(`Heroes Sentiment` = mean(Negative, 
    na.rm = T) * 100)

# Villians Sentiment
villians_grp <- para3[villians_ID, ] %>% mutate(Negative = recode(Cross, Positive = 0, 
    Neutral = 0, Negative = 1), Neutral = recode(Cross, Positive = 0, Neutral = 1, 
    Negative = 0)) %>% group_by(Book) %>% summarize(`Villians Sentiment` = mean(Negative, 
    na.rm = T) * 100)

# Plot
df <- inner_join(heroes_grp, villians_grp, by = "Book") %>% as_tibble()
df <- inner_join(trio_grp, df, by = "Book")
colors <- c(Heroes = "#7F0909", Villians = "#0D6217", `Main Trio` = "#AAAAAA")
ggplot(df, mapping = aes(x = Book, group = NA)) + geom_line(aes(y = `Trio Sentiment`, 
    color = "Main Trio"), size = 3) + geom_line(aes(y = `Heroes Sentiment`, color = "Heroes"), 
    size = 1.5) + geom_line(aes(y = `Villians Sentiment`, color = "Villians"), size = 1.5) + 
    ggtitle("Sentiment of Heroes/Villians Across Books") + Grph_theme() + scale_color_manual(values = colors) + 
    theme(axis.text.x = element_text(size = 8, angle = 45)) + theme(legend.position = "right") + 
    ylim(0, 60) + labs(x = "Books", y = "Negative Sentiment (%)", color = "Legend")

Sentiment at the Page Level

At the beginning of this document, I defined page the level as containing 250 words.

Sentiment Scores

# Sentiment Scores
average <- function(x) {
    pos <- sum(x[x > 0], na.rm = T)
    neg <- sum(x[x < 0], na.rm = T) %>% abs()
    neu <- length(x[x == 0])
    bal <- ((pos - neg)/(pos + neg)) * 100
    y <- ifelse(is.nan(bal), 0, bal %>% as.numeric())
    return(y)
}
CleanText <- basicclean(pages$Text)
pages$Score <- sapply(CleanText, function(x) getSentiment(x, dictionary = "vader", 
    score.type = average))
pages$Cross <- NA
pages$Cross <- ifelse(pages$Score < 0, "Negative", pages$Cross)
pages$Cross <- ifelse(pages$Score > 0, "Positive", pages$Cross)
pages$Cross <- ifelse(pages$Score == 0, "Neutral", pages$Cross)
cat("/n")
pasteNQ("Sentiment Levels")
propTab(pages$Cross)
[1] Set to lowercase
[1] Fixed contractions
[1] Stripped whitespace
/n[1] Sentiment Levels
Negative  Neutral Positive 
      46        0       54 

Sentiment by Page

Finally, we can view sentiment per page to see the rollercoaster of emotions throughout the books. The main takeaways that I see are:

  • Prisoner of Azkaban has a more negative final third than I remember
  • Goblet of Fire has a very happy middle, when Harry is focusing on the Tri-Wizard tournament and the Yule Ball
  • Despite containing a lot of Voldmort’s background, Half-Blood Prince is surprisingly positive until the end
  • Deathly Hallows is mostly negative throughout
  • And the most obvious trend has to be that the end of each book is negative until the final chapter, which always seems to end on a positive note–whether the heroes are victories or Harry’s friends are providing him some support over a more difficult ending
# Grouping by pages for the x-axis
pages %>% group_by(Book) %>% mutate(Paragraph = row_number()) %>% ggplot(aes(Page, 
    Score, fill = Book)) + Grph_theme() + geom_bar(alpha = 0.5, stat = "identity", 
    show.legend = FALSE) + scale_fill_manual(values = c("#946B2D", "#0D6217", "#000A90", 
    "#AAAAAA", "#000000", "#7F0909", "#FFC500")) + facet_wrap(~Book, ncol = 2, scales = "free_x")